Faceting

import numpy as np

# NOTE(review): the wildcard import hides where ggplot/geom_*/aes/facet_* names
# below come from — they are all lets_plot symbols.
from lets_plot import *
import pandas as pd
import vega_datasets
import seaborn as sns
# Enable lets-plot HTML rendering in the notebook.
LetsPlot.setup_html()
# Penguins dataset: one row per penguin. The displayed output below shows the
# numeric measurement columns contain NaNs (e.g. row 3).
data = sns.load_dataset("penguins")
data
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

# Scatter of bill depth vs. flipper length, colored by species.
ggplot() + geom_point(
    data=data,
    mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
)
# Same scatter split into one panel per species, without the color mapping.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm'),
    )
    + facet_grid(x='species')
)
# Facetted panels with the species color mapping kept as well.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species')
)
# Two-layer facet: the first layer's data lacks the 'species' column, so the
# full light-grey point cloud repeats in every panel as a background.
(
    ggplot()
    + geom_point(
        data=data[['bill_depth_mm', 'flipper_length_mm']],
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm'),
        color='lightgrey',
    )
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species')
)
# Same idea with a species x sex facet grid (background layer keeps its
# default color here).
(
    ggplot()
    + geom_point(
        data=data[['bill_depth_mm', 'flipper_length_mm']],
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm'),
    )
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species', y='sex')
)

Visualizing many variables

# Encode two extra variables in a single panel via point size and color.
ggplot() + geom_point(
    data=data,
    mapping=aes(
        x='bill_depth_mm',
        y='flipper_length_mm',
        size='bill_length_mm',
        color='body_mass_g',
    ),
)
# Facetted variant. NOTE(review): the size/color assignments are swapped
# relative to the previous plot (bill length is now color, body mass is size).
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(
            x='bill_depth_mm',
            y='flipper_length_mm',
            color='bill_length_mm',
            size='body_mass_g',
        ),
    )
    + facet_grid(x='species', y='sex')
)
# Long format: one row per (penguin, measurement variable).
id_columns = ["index", "species", "island", "sex"]
melted = data.reset_index().melt(id_vars=id_columns)
# Self-join on the id columns -> every pair of variables per penguin;
# pandas suffixes the overlapping columns with _x / _y.
merged = melted.merge(melted, on=id_columns)

# Scatter-matrix via facetting: one panel per variable pair, colored by
# species; scales='free' lets each panel use its own axis ranges.
(
    ggplot()
    + geom_point(
        data=merged,
        mapping=aes(x='value_x', y='value_y', color='species'),
    )
    + facet_grid(x='variable_x', y='variable_y', scales='free')
)
import altair as alt

# Z-score the numeric columns on a COPY. The original mutated `data` in
# place, which made this cell non-idempotent (re-running it re-normalizes
# already-normalized values) and silently invalidated the earlier cells that
# plotted the raw `data`.
numeric_cols = ['bill_depth_mm', 'flipper_length_mm', 'bill_length_mm', 'body_mass_g']
standardized = data.copy()
standardized[numeric_cols] = (
    standardized[numeric_cols] - standardized[numeric_cols].mean()
) / standardized[numeric_cols].std()

# Parallel-coordinates-style plot: one line per penguin across the four
# standardized variables, colored by species.
alt.Chart(standardized, width=500).transform_window(
    index='count()'
).transform_fold(
    numeric_cols
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color='species:N',
    detail='index:N',
    opacity=alt.value(0.5)
)
import altair as alt
from vega_datasets import data

# NOTE(review): the import above rebinds the name `data` (previously the
# penguins DataFrame) to the vega_datasets loader object — confusing reuse.
source = data.iris()

# Same parallel-coordinates chart, on the classic iris measurements.
iris_measures = ['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth']
base = alt.Chart(source, width=500)
base.transform_window(
    index='count()'
).transform_fold(
    iris_measures
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color='species:N',
    detail='index:N',
    opacity=alt.value(0.5)
)

Tiles and raster marks

from vega_datasets import data

# NOTE(review): the name `data` is reused — first bound to the vega_datasets
# loader by the import, then immediately overwritten with the windvectors
# DataFrame. Re-running the whole cell is safe, but the reuse is confusing.
data = data.windvectors()
data
longitude latitude dir dirCat speed
0 0.125 45.125 228 225 3.12
1 0.375 45.125 228 225 3.24
2 0.625 45.125 229 225 3.34
3 0.875 45.125 229 225 3.44
4 1.125 45.125 228 225 3.48
... ... ... ... ... ...
4795 -1.125 59.875 155 150 5.96
4796 -0.875 59.875 154 150 6.34
4797 -0.625 59.875 153 150 6.71
4798 -0.375 59.875 152 150 7.09
4799 -0.125 59.875 152 150 7.48

4800 rows × 5 columns

# Wind speed on the lat/lon grid, first as points with the default palette...
ggplot() + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed')) + ggsize(1000, 600)
# ...then with a reversed diverging palette,
(
    ggplot()
    + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed'))
    + scale_color_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)
# ...as filled tiles,
(
    ggplot()
    + geom_tile(data=data, mapping=aes(x='longitude', y='latitude', fill='speed'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)
# ...and with the raster geometry.
(
    ggplot()
    + geom_raster(data=data, mapping=aes(x='longitude', y='latitude', fill='speed'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)

2-D Histograms

from vega_datasets import data

# `data` is rebound from the vega_datasets loader to the movies DataFrame.
data = data.movies()
# Ratings scatter colored by genre; movie titles are mapped to the label
# aesthetic.
ggplot() + geom_point(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre', label='Name')) + ggsize(1000, 600)
# Per-genre panels; the grey layer repeats the full point cloud in every
# panel because its data lacks the Major_Genre facetting column.
(
    ggplot()
    + geom_point(
        data=data[['IMDB_Rating', 'Rotten_Tomatoes_Rating']],
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'),
        color='lightgrey',
    )
    + geom_point(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre', label='Name'),
    )
    + facet_wrap('Major_Genre')
    + ggsize(1600, 1000)
)
# 2-D histogram of the two rating scales, 20x20 bins.
ggplot() + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20])
# Same histogram in Cartesian coordinates.
(
    ggplot()
    + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20])
    + coord_cartesian()
)
# Same, with a diverging fill palette.
(
    ggplot()
    + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20])
    + coord_cartesian()
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
)
data
Title US_Gross Worldwide_Gross US_DVD_Sales Production_Budget Release_Date MPAA_Rating Running_Time_min Distributor Source Major_Genre Creative_Type Director Rotten_Tomatoes_Rating IMDB_Rating IMDB_Votes
0 The Land Girls 146083.0 146083.0 NaN 8000000.0 Jun 12 1998 R NaN Gramercy None None None None NaN 6.1 1071.0
1 First Love, Last Rites 10876.0 10876.0 NaN 300000.0 Aug 07 1998 R NaN Strand None Drama None None NaN 6.9 207.0
2 I Married a Strange Person 203134.0 203134.0 NaN 250000.0 Aug 28 1998 None NaN Lionsgate None Comedy None None NaN 6.8 865.0
3 Let's Talk About Sex 373615.0 373615.0 NaN 300000.0 Sep 11 1998 None NaN Fine Line None Comedy None None 13.0 NaN NaN
4 Slam 1009819.0 1087521.0 NaN 1000000.0 Oct 09 1998 R NaN Trimark Original Screenplay Drama Contemporary Fiction None 62.0 3.4 165.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3196 Zack and Miri Make a Porno 31452765.0 36851125.0 21240321.0 24000000.0 Oct 31 2008 R 101.0 Weinstein Co. Original Screenplay Comedy Contemporary Fiction Kevin Smith 65.0 7.0 55687.0
3197 Zodiac 33080084.0 83080084.0 20983030.0 85000000.0 Mar 02 2007 R 157.0 Paramount Pictures Based on Book/Short Story Thriller/Suspense Dramatization David Fincher 89.0 NaN NaN
3198 Zoom 11989328.0 12506188.0 6679409.0 35000000.0 Aug 11 2006 PG NaN Sony Pictures Based on Comic/Graphic Novel Adventure Super Hero Peter Hewitt 3.0 3.4 7424.0
3199 The Legend of Zorro 45575336.0 141475336.0 NaN 80000000.0 Oct 28 2005 PG 129.0 Sony Pictures Remake Adventure Historical Fiction Martin Campbell 26.0 5.7 21161.0
3200 The Mask of Zorro 93828745.0 233700000.0 NaN 65000000.0 Jul 17 1998 PG-13 136.0 Sony Pictures Remake Adventure Historical Fiction Martin Campbell 82.0 6.7 4789.0

3201 rows × 16 columns

def density(data, x, y, bw=(0.25, 2.5), x_col='IMDB_Rating', y_col='Rotten_Tomatoes_Rating'):
    """Gaussian-kernel density estimate of `data` evaluated at point (x, y).

    Parameters:
        data: DataFrame holding the sample points in columns `x_col`/`y_col`.
        x, y: coordinates of the evaluation point.
        bw: per-axis bandwidths (x, y). Changed from a mutable list default to
            a tuple; indexing behavior is unchanged.
        x_col, y_col: column names to read — generalized from the previously
            hard-coded movie-rating columns; defaults preserve old behavior.

    Returns:
        Mean of exp(-d^2) over all sample points, where d is the
        bandwidth-scaled distance. When the columns are pandas Series, NaN
        entries are skipped by .mean() — presumably intentional for the
        movies data, which contains NaN ratings.
    """
    x_dist = ((x - data[x_col]) / bw[0]) ** 2
    y_dist = ((y - data[y_col]) / bw[1]) ** 2
    return np.exp(-(x_dist + y_dist)).mean()

# Evaluate the manual density() on a 200x200 grid over the rating ranges.
x_coords, y_coords = np.meshgrid(np.linspace(0, 10, 200), np.linspace(0, 100, 200))
xy = np.stack([x_coords.flatten(), y_coords.flatten()]).T
# Build the grid under a NEW name. The original overwrote `data` (the movies
# DataFrame) with this grid, which broke the geom_density2df cells below that
# still expect IMDB_Rating / Rotten_Tomatoes_Rating columns in `data`.
density_grid = pd.DataFrame(dict(
    x=xy[:, 0],
    y=xy[:, 1],
    density=[density(data, xi, yi) for (xi, yi) in xy]
))
(
    ggplot()
    + geom_raster(data=density_grid, mapping=aes(x='x', y='y', fill='density'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
    + coord_cartesian()
)
# Filled 2-D density contours of the ratings, colored by contour level.
# NOTE(review): as written in this notebook, `data` was reassigned to the
# density grid in the previous cell, so these three cells only work when run
# before it (out-of-order execution) — confirm and fix that reassignment.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'), bins=20, show_legend=True)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)
# Same, with explicit bandwidths matching the manual density() defaults.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'), bw=[0.25, 2.5], bins=20, show_legend=True)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)
# Filled by contour group instead of level, legend suppressed.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..group..'), show_legend=False)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

Voronoi diagrams

from vega_datasets import data

# NOTE(review): the import above rebinds `data` to the vega_datasets loader
# module, clobbering the DataFrame from the previous section.
airports = data.airports()
flights = data.flights_airport()
airports
iata name city state country latitude longitude
0 00M Thigpen Bay Springs MS USA 31.953765 -89.234505
1 00R Livingston Municipal Livingston TX USA 30.685861 -95.017928
2 00V Meadow Lake Colorado Springs CO USA 38.945749 -104.569893
3 01G Perry-Warsaw Perry NY USA 42.741347 -78.052081
4 01J Hilliard Airpark Hilliard FL USA 30.688012 -81.905944
... ... ... ... ... ... ... ...
3371 ZEF Elkin Municipal Elkin NC USA 36.280024 -80.786069
3372 ZER Schuylkill Cty/Joe Zerbey Pottsville PA USA 40.706449 -76.373147
3373 ZPH Zephyrhills Municipal Zephyrhills FL USA 28.228065 -82.155916
3374 ZUN Black Rock Zuni NM USA 35.083227 -108.791777
3375 ZZV Zanesville Municipal Zanesville OH USA 39.944458 -81.892105

3376 rows × 7 columns

# Quick look at airport locations. Fixed: the original passed `data`, which
# at this point is the vega_datasets loader module (rebound by the import
# above), not a DataFrame — `airports` is the frame that actually has the
# longitude/latitude columns used here.
ggplot() + geom_point(data=airports, mapping=aes(x='longitude', y='latitude'))
# Attach each (origin, destination, count) flight row to its origin
# airport's metadata and coordinates.
counts = flights.merge(airports, left_on='origin', right_on='iata')
counts
origin destination count iata name city state country latitude longitude
0 ABE ATL 853 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
1 ABE BHM 1 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
2 ABE CLE 805 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
3 ABE CLT 465 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
4 ABE CVG 247 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
... ... ... ... ... ... ... ... ... ... ...
5361 YUM IPL 326 YUM Yuma MCAS-Yuma International Yuma AZ USA 32.656583 -114.605972
5362 YUM LAS 99 YUM Yuma MCAS-Yuma International Yuma AZ USA 32.656583 -114.605972
5363 YUM LAX 1044 YUM Yuma MCAS-Yuma International Yuma AZ USA 32.656583 -114.605972
5364 YUM PHX 1961 YUM Yuma MCAS-Yuma International Yuma AZ USA 32.656583 -114.605972
5365 YUM SLC 440 YUM Yuma MCAS-Yuma International Yuma AZ USA 32.656583 -114.605972

5366 rows × 10 columns

# Total departing flights per origin airport, keeping its coordinates
# (latitude/longitude are constant within each origin, so 'first' suffices).
columns_of_interest = ['origin', 'count', 'latitude', 'longitude']
counts = (
    counts[columns_of_interest]
    .groupby('origin')
    .agg({'count': 'sum', 'latitude': 'first', 'longitude': 'first'})
)
counts
count latitude longitude
origin
ABE 4807 40.652363 -75.440402
ABI 2660 32.411320 -99.681897
ABQ 41146 35.040222 -106.609194
ABY 1095 31.535515 -84.194473
ACK 457 41.253052 -70.060181
... ... ... ...
WYS 264 44.688399 -111.117638
XNA 14112 36.281869 -94.306811
YAK 725 59.503361 -139.660226
YKM 340 46.568170 -120.544059
YUM 3871 32.656583 -114.605972

303 rows × 3 columns

# Busy airports (>2000 departures) within the 25-50 degree latitude band,
# written as an explicit boolean mask instead of a query string.
counts[(counts['count'] > 2000) & (counts['latitude'] > 25) & (counts['latitude'] < 50)]
count latitude longitude
origin
ABE 4807 40.652363 -75.440402
ABI 2660 32.411320 -99.681897
ABQ 41146 35.040222 -106.609194
ACV 3714 40.978115 -124.108619
AEX 2330 31.327372 -92.548556
... ... ... ...
TVC 4262 44.741445 -85.582235
TYS 14004 35.812487 -83.992856
VPS 6820 30.483250 -86.525400
XNA 14112 36.281869 -94.306811
YUM 3871 32.656583 -114.605972

194 rows × 3 columns

# Scatter of the busy-airport subset used for the Voronoi diagram.
ggplot() + geom_point(
    data=counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)'),
    mapping=aes(x='longitude', y='latitude'),
)
from scipy.spatial import Voronoi

v = Voronoi(points=counts[['longitude', 'latitude']].values)
# One row per (region id, vertex). Fixed: the keep/skip condition depends
# only on the region, not on the vertex, so it now sits on the outer loop
# instead of being re-evaluated for every vertex; `-1 in r` marks unbounded
# regions that extend to infinity.
v_df = pd.DataFrame(
    [(i, *v.vertices[v_id])
     for i, r in enumerate(v.regions)
     if any(r) and -1 not in r
     for v_id in r],
    columns=['id', 'x', 'y'],
)

(
    ggplot()
    + geom_polygon(aes(x='x', y='y', group='id', fill='id'),
                   data=v_df, show_legend=False, color='black', alpha=0.)
    + geom_point(aes(x='longitude', y='latitude'), data=counts,
                 shape=21, color='black', fill='white')
    + scale_fill_discrete()
    + scale_x_continuous(limits=[-125, -70])
    + scale_y_continuous(limits=[25, 50])
    + ggsize(1200, 800)
)
# Each ridge is a pair of vertex indices, so indexing vertices by
# ridge_vertices yields an (n_ridges, 2, 2) coordinate array.
v.vertices[v.ridge_vertices].shape
(897, 2, 2)
# A single Voronoi vertex: (longitude, latitude) coordinates.
v.vertices[0]
array([-122.52061032,  -13.33427622])
from scipy.spatial import Voronoi

# Same diagram drawn as ridge segments instead of filled polygons.
v = Voronoi(points=counts[['longitude', 'latitude']].values)
# One row per finite ridge: both endpoint coordinates flattened into
# (x1, y1, x2, y2). Fixed: dropped the unused enumerate() index from the
# comprehension; `-1 in r` marks ridges that extend to infinity.
v_df = pd.DataFrame(
    [v.vertices[r[0]].tolist() + v.vertices[r[1]].tolist()
     for r in v.ridge_vertices
     if any(r) and -1 not in r],
    columns=['x1', 'y1', 'x2', 'y2'],
)

(
    ggplot()
    + geom_segment(aes(x='x1', y='y1', xend='x2', yend='y2',),
                   data=v_df, show_legend=False, color='black')
    + geom_point(aes(x='longitude', y='latitude'), data=counts,
                 shape=21, color='black', fill='white')
    + scale_fill_discrete()
    + scale_x_continuous(limits=[-125, -70])
    + scale_y_continuous(limits=[25, 50])
    + ggsize(1200, 800)
)
v_df
x1 y1 x2 y2
0 -162.443216 63.804203 -157.848949 64.376560
1 -162.443216 63.804203 -169.585957 56.901014
2 -157.848949 64.376560 -165.377812 49.900827
3 -169.585957 56.901014 -165.377812 49.900827
4 -156.615711 65.020800 -157.848949 64.376560
... ... ... ... ...
883 -90.176172 43.823228 -89.330875 43.993172
884 -88.647938 42.812487 -88.565038 43.432585
885 -91.127222 45.568721 -88.856878 45.038552
886 -88.306081 45.632959 -88.755175 45.108732
887 -87.172633 43.843489 -87.947783 43.726042

888 rows × 4 columns

# NOTE(review): the trailing bare geom_segment() adds a layer with no data or
# mapping — it looks like leftover scaffolding for overlaying the Voronoi
# ridges (v_df) on the filtered airports; confirm the intent or remove it.
ggplot() + geom_point(data=counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)'), mapping=aes(x='longitude', y='latitude')) + geom_segment()